Descrition of data cleaning:
reading in data
selecting and renaming the columns
since names are in last, first format, reorder them to be first last
make sure all “Jr”s go at the end
remove accents using stringi::stri_trans_general
salaries = read_excel(
"data/MLB-Salaries 2000-24.xlsx",
sheet = "2022.xls",
skip = 1) |>
select(1:4) |>
rename(position = "Pos'n",
salary_2022 = "2022.0",
service_time_yrs = "MLS",
name = "Player") %>%
mutate(name = str_split(name, ","),
name = map(name, rev),
name = map_chr(name, str_c, collapse = " "),
name = str_trim(name),
name = str_replace_all(name, "[*#\\.,]", ""),
junior = str_detect(name, "Jr"),
name = if_else(junior == TRUE, str_remove(name, " Jr"), name),
name = if_else(junior == TRUE, str_c(name, " Jr"), name),
name = stringi::stri_trans_general(name,id = "Latin-ASCII")) |>
select(-junior) %>%
mutate(simple_position = str_split_i(position, "-", 1),
simple_position = fct_relevel(simple_position,
c("c", "1b", "2b", "3b", "ss",
"lf", "cf", "rf", "inf", "of",
"dh", "rhp", "lhp")),
service_time_floor = floor(service_time_yrs))
Descrition of data cleaning:
reading in data and clean names
grouping by name and using a count to ensure we only end up with one of each name (slightly complicated due to interleague trades and potential for players with the same name)
remove whitespace, from names and use stringi again to remove accents so that the formatting exactly matches the salaries tibble
Filter out those with fewer than 100 plate appearances
batting = read_delim("data/2022 MLB Player Stats - Batting.csv", delim = ";",
locale = locale(encoding = "latin1")) |>
janitor::clean_names() %>%
group_by(name) |>
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & tm == "TOT" ~ TRUE,
.default = FALSE)) |>
filter(keep_row == TRUE) %>%
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & lg == "MLB" ~ TRUE,
.default = FALSE)) |>
ungroup() |>
filter(keep_row == TRUE) %>%
mutate(name = str_split(name, "\\s+"),
name = map_chr(name, str_c, collapse = " "),
name = str_trim(name),
name = str_replace_all(name, "[*#\\.]", ""),
name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
merged_batting <- salaries |>
mutate(simple_position = str_split_i(position, "-", 1)) |>
filter(!simple_position %in% c("rhp","lhp")) |>
inner_join(batting, by = "name") |>
filter(pa >= 100)
salary_not_in_batting <- anti_join(salaries, batting, by = "name")
batting_not_in_salary <- anti_join(batting, salaries, by = "name")
(same data cleaning process as above)
pitching = read_delim("data/2022 MLB Player Stats - Pitching.csv", delim = ";",
locale = locale(encoding = "latin1")) |>
janitor::clean_names() %>%
group_by(name) |>
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & tm == "TOT" ~ TRUE,
.default = FALSE)) |>
filter(keep_row == TRUE) %>%
mutate(name_count = n(),
keep_row = case_when(name_count == 1 ~ TRUE,
name_count > 1 & lg == "MLB" ~ TRUE,
.default = FALSE)) |>
ungroup() |>
filter(keep_row == TRUE) %>%
mutate(name = str_split(name, "\\s+"),
name = map_chr(name, str_c, collapse = " "),
name = str_trim(name),
name = str_replace_all(name, "[*#\\.]", ""),
name = stringi::stri_trans_general(name,id = "Latin-ASCII"))
merged_pitching <- inner_join(salaries, pitching, by = "name") |>
filter(ip >= 20) |>
separate(ip, into = c("ip", "ip_dec"), remove = FALSE, convert = TRUE) |>
mutate(ip_dec_333 = ifelse(is.na(ip_dec), 0, ip_dec * 333) ,
ip_total = paste(ip, ip_dec_333, sep = "."),
ip_total = as.numeric(ip_total)) |>
select(-ip, -ip_dec, -ip_dec_333) |>
separate(position, into = c("hand", "pitcher_type")) |>
mutate(pitcher_type = ifelse(is.na(pitcher_type), "r", pitcher_type)) |>
filter(hand == "lhp" | hand == "rhp")
salary_not_in_pitching <- anti_join(salaries, pitching, by = "name")
pitching_not_in_salary <- anti_join(pitching, salaries, by = "name")
Note that there are a number of names in salaries not
found in batting or pitching, and vice versa.
Some of this may be genuine missingness (e.g. salaries has fewer rows
than pitching and batting combined), but some is also due to alternative
spellings of names in the datasets. Possibly could be fixed using other
matching methods, but the analysis pipeline will still work from
here.
Some Basic EDA for salaries on their own
Service time (i.e. number of years of experience)
salaries |>
count(service_time_floor) |>
ggplot(aes(x = service_time_floor, y = n)) +
geom_col() +
labs(title = "Number of Players per Year of Experience",
x = "Years in the MLB",
y = "Number of Players") +
scale_x_continuous(breaks = seq(0,21,by=1)) +
theme_bw()
This is right skewed, so may need to transform it if used in
regression
Salary
salaries |>
ggplot(aes(x = salary_2022)) +
geom_density() +
labs(title = "2022 Salary Distribution",
x = "2022 Salary",
y = "Density") +
scale_x_continuous(labels = label_comma()) +
theme_bw()
salaries |>
mutate(position_group = case_match(simple_position,
c("rhp","lhp") ~ "pitcher",
c("cf","lf","rf","of") ~ "outfield",
"dh" ~ "dh",
.default = "infield")) |>
ggplot(aes(x = salary_2022, color = position_group)) +
geom_density() +
labs(title = "Salary by Position Group",
x = "2022 Salary",
y = "Density") +
scale_x_continuous(labels = label_comma()) +
scale_color_discrete(name = "Position Group",
labels = c("Designated Hitter",
"Infield",
"Outfield",
"Pitcher")) +
theme_bw()
This is also heavily right skewed, with many players having salary under
$1,000,000
Position type table
merged_batting |>
count(simple_position) |>
mutate(simple_position = fct_infreq(simple_position,w = n)) |>
ggplot(aes(x = simple_position, y = n)) +
geom_col()
salaries %>%
ggplot(aes(x = simple_position, y = salary_2022)) +
geom_boxplot()
Infielders vs. outfielders vs. pitchers?
salaries |>
mutate(position_group = case_match(simple_position,
c("rhp","lhp") ~ "pitcher",
c("cf","lf","rf","of") ~ "outfield",
"dh" ~ "dh",
.default = "infield")) |>
ggplot(aes(x = position_group, y = salary_2022)) +
geom_boxplot()
salaries %>%
ggplot(aes(x = factor(service_time_floor), y = salary_2022)) +
geom_boxplot()
salaries %>%
group_by(service_time_floor) %>%
summarize(avg_salary = mean(salary_2022, na.rm = TRUE)) %>%
ggplot(aes(x = service_time_floor, y = avg_salary)) +
geom_col()
salaries |>
ggplot(aes(x = service_time_floor, y = salary_2022)) +
geom_point(alpha = 0.5)
salaries %>%
ggplot(aes(x=floor(service_time_yrs),fill = simple_position)) +
geom_bar(position = "fill")
salaries |>
ggplot(aes(x = service_time_floor,
y = simple_position,
fill = simple_position)) +
ggridges::geom_density_ridges(alpha = 0.5)
salaries %>%
ggplot(aes(x=factor(floor(service_time_yrs)), y=salary_2022)) +
geom_boxplot()
Looking at some hitting statistics
merged_batting |>
ggplot(aes(x = hr)) +
geom_histogram() +
labs(x = "Number of home runs (HRs)")
merged_batting |>
ggplot(aes(x = rbi)) +
geom_histogram() +
labs(x = "Runs batted in (RBI)")
These are also left skewed so will need to normalize for any linear model.
OPS is an “all in one” statistic combining on-base percentage (OBP) and slugging (SLG).
OBP is calculated as \(\frac{Hits (H) + Walks (BB) + Hit by pitch (HBP)}{At \ bats (AB) + Walks (BB) + sacrifice \ flies (SF) + Hit by pitch (HBP)}\)
Slugging is calculated as \(\frac{total \ bases (TB)}{At \ bats (AB)}\)
OPS is the sum of these two statistics.
OPS+ (or adjusted OPS) is adjusted for the park and league averages.
merged_batting |>
ggplot(aes(x = ops)) +
geom_histogram() +
labs(x = "OPS")
merged_batting |>
ggplot(aes(x = ops_2)) +
geom_histogram() +
labs(x = "OPS+")
One idea might be to compare OPS or OPS+ to more traditional statistics, like RBIs, HRs, or batting averages to OPS or OPS+ as single predictors of salary.
One other factor to consider is the correlation between some of these variables
merged_batting |>
select(hr, rbi, pa, ba, ops) |>
GGally::ggpairs()
Some of these potential predictors are fairly strongly correlated (like RBI and HR, or BA and OPS), so it’s important not to include to many collinear variables in a potential linear model.
Looking at some relationships to salary
Position vs. salary
merged_batting |>
mutate(positions = str_split_i(position, "-",1)) |>
ggplot(aes(x = positions,y= salary_2022)) +
geom_boxplot()
merged_batting |>
mutate(positions = str_split_i(position, "-",1)) |>
group_by(positions) |>
summarize(avg_salary = mean(salary_2022, na.rm = TRUE),
median_salary = median(salary_2022, na.rm = TRUE))
## # A tibble: 11 × 3
## positions avg_salary median_salary
## <chr> <dbl> <dbl>
## 1 1b 8845755. 6225000
## 2 2b 5010094. 2550000
## 3 3b 8302384. 3375000
## 4 c 3669983. 1750000
## 5 cf 8145270. 5500000
## 6 dh 11857143. 12000000
## 7 inf 1312500 1312500
## 8 lf 6971404. 6125000
## 9 of 2211196. 746100
## 10 rf 7833411. 4875000
## 11 ss 7266837 4000000
HR vs. salary
merged_batting |>
ggplot(aes(x = hr, y = salary_2022)) +
geom_point()
RBI vs. salary
merged_batting |>
ggplot(aes(x = rbi, y = salary_2022)) +
geom_point()
BA vs. salary
merged_batting |>
ggplot(aes(x = ba, y = salary_2022)) +
geom_point()
Plate appearances vs. salary
merged_batting |>
ggplot(aes(x = pa, y = salary_2022)) +
geom_point()
OPS+ vs salary
merged_batting |>
ggplot(aes(x = ops_2, y = salary_2022)) +
geom_point()
Since salary (and other variables) are very skewed, we may want to transform these data using a log transformation.
Home runs
hr_fit <- lm(log(salary_2022,10) ~ hr,
data = merged_batting)
hr_fit |>
broom::tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.22 0.0445 140. 2.09e-310
## 2 hr 0.0206 0.00291 7.08 7.87e- 12
OPS+
ops_fit <- lm(log(salary_2022,10) ~ ops_2,
data = merged_batting)
ops_fit |>
broom::tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.96 0.100 59.4 1.38e-185
## 2 ops_2 0.00531 0.00100 5.28 2.22e- 7
Would also want to account for years of experience in predicting salary
experience_fit <- lm(log(salary_2022,10) ~ service_time_floor,
data = merged_batting)
experience_fit |>
broom::tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.98 0.0312 192. 0
## 2 service_time_floor 0.115 0.00562 20.5 6.59e-62
A multivariable model including all of these
multivar_fit <- lm(log(salary_2022,10) ~ hr + ops_2 + service_time_floor,
data = merged_batting)
multivar_fit |>
broom::tidy()
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.69 0.0759 75.0 1.13e-217
## 2 hr 0.0115 0.00285 4.04 6.53e- 5
## 3 ops_2 0.00176 0.000948 1.85 6.47e- 2
## 4 service_time_floor 0.110 0.00522 21.1 2.06e- 64
In order to eliminate pitchers with very few appearances or position players (catchers, first basemen, etc.) who came in to pitch in blowouts, I filtered only for pitchers with at least 10 innings pitches.
merged_pitching |>
ggplot(aes(x = ip_total)) +
geom_histogram(binwidth = 2, fill = "lightblue", col = "black")
merged_pitching |>
ggplot(aes(x = era_2)) +
geom_histogram(fill = "lightblue", col = "black")
merged_pitching |>
ggplot(aes(x = ip_total, y = salary_2022)) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
merged_pitching |>
ggplot(aes(x = era_2, y = salary_2022)) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
merged_pitching |>
ggplot(aes(x = ip_total, y = log(salary_2022))) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
merged_pitching |>
ggplot(aes(x = era_2, y = log(salary_2022))) +
geom_point() +
scale_x_continuous(lim = c(0, 400))
lm_ip = lm(salary_2022 ~ ip_total, data = merged_pitching)
summary(lm_ip)
##
## Call:
## lm(formula = salary_2022 ~ ip_total, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7881584 -2346730 -1339157 542534 36868673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 453710 505732 0.897 0.37
## ip_total 41360 5280 7.833 4e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5394000 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.1283, Adjusted R-squared: 0.1262
## F-statistic: 61.35 on 1 and 417 DF, p-value: 4.002e-14
lm_ip_data = tibble(fitted.values(lm_ip), residuals(lm_ip))
names(lm_ip_data) = c("fitted", "resid")
lm_ip_data |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
view(lm_ip_data)
lm_era_2 = lm(salary_2022 ~ era_2, data = merged_pitching)
summary(lm_era_2)
##
## Call:
## lm(formula = salary_2022 ~ era_2, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4017039 -3088950 -2721275 385077 39372391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3560272 674783 5.276 2.12e-07 ***
## era_2 2371 5297 0.448 0.655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5776000 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.0004801, Adjusted R-squared: -0.001917
## F-statistic: 0.2003 on 1 and 417 DF, p-value: 0.6547
lm_era_2_data = tibble(fitted.values(lm_era_2), residuals(lm_era_2))
names(lm_era_2_data) = c("fitted", "resid")
lm_era_2_data |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
view(lm_era_2_data)
lm_ip_log = lm(log(salary_2022) ~ ip_total, data = merged_pitching)
summary(lm_ip_log)
##
## Call:
## lm(formula = log(salary_2022) ~ ip_total, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8882 -0.7565 -0.4034 0.7288 3.1811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.776177 0.097656 141.069 < 2e-16 ***
## ip_total 0.008032 0.001020 7.877 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.042 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.1295, Adjusted R-squared: 0.1274
## F-statistic: 62.05 on 1 and 417 DF, p-value: 2.933e-14
lm_ip_log_data = tibble(fitted.values(lm_ip_log), residuals(lm_ip_log))
names(lm_ip_log_data) = c("fitted", "resid")
lm_ip_log_data |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
view(lm_ip_log_data)
lm_era_2_log = lm(log(salary_2022) ~ era_2, data = merged_pitching)
summary(lm_era_2_log)
##
## Call:
## lm(formula = log(salary_2022) ~ era_2, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9979 -0.9559 -0.5614 0.8333 3.1417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.441e+01 1.304e-01 110.497 <2e-16 ***
## era_2 1.869e-04 1.024e-03 0.183 0.855
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.116 on 417 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 7.994e-05, Adjusted R-squared: -0.002318
## F-statistic: 0.03334 on 1 and 417 DF, p-value: 0.8552
lm_era_2_data_log = tibble(fitted.values(lm_era_2_log), residuals(lm_era_2_log))
names(lm_era_2_data_log) = c("fitted", "resid")
lm_era_2_data_log |>
ggplot(aes(x = fitted, y = resid)) +
geom_point()
lm_pitching = lm(salary_2022 ~ hand + pitcher_type + hand*pitcher_type + ip_total + era_2,
data = merged_pitching)
summary(lm_pitching)
##
## Call:
## lm(formula = salary_2022 ~ hand + pitcher_type + hand * pitcher_type +
## ip_total + era_2, data = merged_pitching)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9175137 -1716370 -699492 602203 35065484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15005127 4956120 3.028 0.00262 **
## handrhp -8606379 5351662 -1.608 0.10857
## pitcher_typer -14812000 4970647 -2.980 0.00305 **
## pitcher_types -8591542 5028351 -1.709 0.08828 .
## ip_total 16873 5758 2.930 0.00357 **
## era_2 4389 4639 0.946 0.34466
## handrhp:pitcher_typer 8656223 5386994 1.607 0.10885
## handrhp:pitcher_types 7266736 5438133 1.336 0.18221
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4937000 on 411 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.2803, Adjusted R-squared: 0.268
## F-statistic: 22.87 on 7 and 411 DF, p-value: < 2.2e-16
From our model, we can see that relief pitchers have significantly
lower salaries than closing pitchers, which makes sense, since closing
pitchers have very strong reputations as some of the most effective
pitchers in the game and therefore will draw high salaries.
Additionally, the era_2 predictor, even when accounting for
pitcher handedness, pitcher type, and the number of innings pitched, has
an insignificant \(P\)-value, implying
that there is no significant association between one of the most highly
regarded summative statistics for pitching effectiveness and pitcher
salary.